## Programmer: Vito D'Orazio
## select_sequences_ucdp_111914.r
## Script to select sequences using the UCDP data for conflict onset
## inputs: events_GTDS_old.dta, 124920_1ucdpprioarmedconflictdataset4.2014.rdata
## outputs: select_sequences_ucdp_111914.RData

rm(list=ls())
library(foreign)
library(zoo)
library(countrycode)

setwd("/Users/vjdorazio/Desktop/Sequence_Analysis_paper/replication_data")

mydata <- read.dta("events_GTDS_old.dta")
load("/Users/vjdorazio/Desktop/Sequence_Analysis_paper/plos_version/data/124920_1ucdpprioarmedconflictdataset4.2014.rdata")

mydata$date_str[which(nchar(mydata$date_str)==6)] <- paste(substr(mydata$date_str[which(nchar(mydata$date_str)==6)], 1,5), "0", substr(mydata$date_str[which(nchar(mydata$date_str)==6)], 6,6), sep="")
mydata <- mydata[order(mydata$COUNTRY,mydata$date_str),]

##### UCDP data formatting
#civil conflicts
ucdpConflict <- ucdpConflict[which(ucdpConflict$Type==3 | ucdpConflict$Type==4),]

# converting date to 1998w1 style format from 1998-01-01 format
ucdpdate <- strptime(ucdpConflict$StartDate2, "%Y-%m-%d")
ucdpConflict$date_str <- format(ucdpdate, "%Yw%W")

ucdpenddate <- strptime(ucdpConflict$EpEndDate, "%Y-%m-%d")
ucdpConflict$end_date_str <- format(ucdpenddate, "%Yw%W")


# country name conversion
ucdpConflict$COUNTRY <- substr(ucdpConflict$SideA, 15, 100)
ucdpConflict$COUNTRY <- countrycode(ucdpConflict$COUNTRY, origin="country.name", destination="iso3c")

# drop duplicates for multiple onsets in a week
ucdpConflict <- ucdpConflict[which(!duplicated(cbind(ucdpConflict$COUTNRY, ucdpConflict$date_str))),]

# sorting
ucdpConflict$date_str[which(nchar(ucdpConflict$date_str)==6)] <- paste(substr(ucdpConflict$date_str[which(nchar(ucdpConflict$date_str)==6)], 1,5), "0", substr(ucdpConflict$date_str[which(nchar(ucdpConflict$date_str)==6)], 6,6), sep="")
ucdpConflict <- ucdpConflict[order(ucdpConflict$COUNTRY,ucdpConflict$date_str),]

# and merge
mydata <- merge(x=mydata, y=ucdpConflict[,c("COUNTRY", "date_str", "ConflictId", "end_date_str")], by=c("COUNTRY", "date_str"), all.x=TRUE)

# code ucdp onset
mydata$ucdpconflict <- 0
mydata$ucdpconflict[which(!is.na(mydata$ConflictId))] <- 1

# coding conflict episodes at the weekly level.  after onset, code as 1 until end date.  if no end date, conflict episode lasts the rest of the year.  this is really only important for extracting valid peace sequences.

episode <- FALSE
for(i in 1:nrow(mydata)) {
    if(mydata$ucdpconflict[i]==1) {
        episode <- TRUE
        temp.end <- mydata$end_date_str[i]
        if(is.na(temp.end)) {
            year <- substr(mydata$date_str[i], 1, 4)
            temp.end <- paste(year, "w52", sep="")
        }
        next
    }
    
    if(episode==TRUE) {
        if(mydata$date_str[i]==temp.end) {
            mydata$ucdpconflict[i] <- 1
            episode <- FALSE
            next
        } else {
            mydata$ucdpconflict[i] <- 1
            next
        }
    }
    
    mydata$ucdpconflict[i] <- 0
}


## SECTION 2.a ## SECTION 2.a ## SECTION 2.a ## SECTION 2.a ##
##############################################################
## ONSET AND POTENTIAL PEACE #### ONSET AND POTENTIAL PEACE ##

## recycled code, don't mind the "reb" tags, it's UCDP
gtds.reb <- mydata$ucdpconflict
reb.potpeace <- vector(mode = "integer", length = nrow(mydata))
reb.onset <- vector(mode = "integer", length = nrow(mydata))

reb.count <- 0

for(i in 1:nrow(mydata)) {

    # resetting all counts when we index to a new country
    if(i>1 && mydata$COUNTRY[i] != mydata$COUNTRY[i-1]) {
        reb.count <- 0
        next
    }

    # rebellion
    if (gtds.reb[i] == 0) { # no GTDS conflict
        reb.count <- reb.count+1
        if(reb.count > 12) {
            reb.potpeace[i] <- 1
        }
    }
    if (gtds.reb[i] == 1) { # yes GTDS conflict
        if(reb.count >= 12) {
            reb.onset[i] <- 1
        }
        reb.count <- 0
    }
} # end for loop

mydata <- cbind(mydata,reb.onset,reb.potpeace)


## SECTION 2.b ## SECTION 2.b ## SECTION 2.b #####
##################################################
## SELECTING SEQUENCES #### SELECTING SEQUENCES ##

# Subsetting out all states that never experience conflict

# properly sorting the data
mydata <- mydata[order(mydata$COUNTRY,mydata$date_str),]

## The following section of code subsets mydata to only include the states
## that experience at least one observation of conflict onset
keep.state <- vector(mode="character",length = length(unique(mydata$COUNTRY)))
temp.data <- matrix(data=NA, nrow=0,ncol=ncol(mydata))

# int.onset is not included because in our study we are forecasting domestic violence
keep.data <- mydata[which(mydata$reb.onset == 1 ),]
keep.data$COUNTRY <- as.vector(keep.data$COUNTRY)
keep.state <- unique(keep.data$COUNTRY)
keep.state <- as.vector(keep.state)
mydata$COUNTRY <- as.vector(mydata$COUNTRY)
for (j in 1:length(keep.state)) {
  temp.data <- rbind(temp.data,subset(mydata, mydata$COUNTRY == keep.state[j]))
}

# reestablish mydata and sorting
mydata <- temp.data[order(temp.data$COUNTRY,temp.data$date_str),]

## The following section of code selects the potential peace sequences to be kept

set.seed(4422)
set.peaces <- 2 # multiplied by the number of onsets for the total number of peace sequences to be kept

# int.onset is not included because in our study we are forecasting domestic violence
sum.onset <- sum(mydata$reb.onset, na.rm=TRUE)

# Important theoretical decision was made here to ONLY select observations
# where we observe potential peace sequences across ALL conflict variables
# This can be thought of as the GTDS version of a "total peace"

total.peace <- vector(mode="numeric",length=nrow(mydata)) # a vector of zeros
total.peace[mydata$reb.potpeace==1 ] <- 1
sum.peace <- sum(total.peace, na.rm=TRUE)
chance <- set.peaces * sum.onset / sum.peace

mydata <- cbind(mydata,total.peace)


##############################################################################################################
## NOTE: Everything below this point until the expanded section is NOT necessary for baseline peace archeotype


keep.peace <- vector(mode="numeric",length=nrow(mydata)) # a vector of zeros

# a single draw from a binomial distribution of size 1... a Bernoulli...
# with the probability of drawing a 1 equal to chance

for(k in 1:nrow(mydata)) {
  if(mydata$total.peace[k] == 0) next
  keep.peace[k] <- rbinom(n=1,size=1,prob=chance)
}
mydata <- cbind(mydata,keep.peace)

mydata <- mydata[order(mydata$COUNTRY,mydata$date_str),]

# These are the observations that end the 13-week sequences
keep.obs <- vector(mode="numeric",length=nrow(mydata)) # a vector of zeros
keep.obs[mydata$keep.peace==1] <- 1
keep.obs[mydata$reb.onset==1] <- 1

mydata <- cbind(mydata,keep.obs)

#mydata$date_str[which(nchar(mydata$date_str)==6)] <- paste(substr(mydata$date_str[which(nchar(mydata$date_str)==6)], 1,5), "0", substr(mydata$date_str[which(nchar(mydata$date_str)==6)], 6,6), sep="")

mydata <- mydata[order(mydata$COUNTRY,mydata$date_str),]

mydata$month[which(mydata$month=="")] <- NA
mydata$ctryname[which(mydata$ctryname=="")] <- NA

mydata$month <- na.locf(mydata$month)
mydata$ctryname <- na.locf(mydata$ctryname)

mydata$Other_Crisis <- 0

## SECTION 2.c ## SECTION 2.c ## SECTION 2.c ## SECTION 2.c #############
#########################################################################
## ESTABLISHING SEQUENCES  ESTABLISHING SEQUENCES  ESTABLISHING SEQUENCES

## This following uses 3d arrays to form our sequences.
## Picture a draw with a stack of papers.  Each piece of paper is an 11x13 matrix.
## Each 11x13 matrix corresponds to a single sequence-worth of data.
## We have 11 variables and 13 weeks.

# Establish two separate 3-dimensional arrays.
# One is comrpised of the t-12 to t-1 week sequences preceding a true keep.obs.
# The other is comprised of the t-17 to t-6 week sequences preceding a true keep.obs.

sum.seq <- sum(mydata$keep.obs)
history <- array(0,dim=c(12,13,sum.seq))
lag.history <- array(0,dim=c(11,13,sum.seq)) # some of the higher number matrices will be empty e.g. lag.history[,,230]
count.seq <- 0
count.lagseq <- 0 # vjd 10/16/10
temp.counts <- vector(mode="numeric",length=4)
temp.week <- "1998w01"
temp.state <- "USA"
temp.crises <- 0


# Nested for loops that fill in our history array
# history[,,1] contains the data of one sequence at index 1
for(i in 1:nrow(mydata)) {
    cat(i, " \n")
  if(isTRUE(mydata$keep.obs[i]==1)) { # if the observation is either an onset OR a selected peace
    temp.type.seq <- 0
    if(isTRUE(mydata$keep.peace[i]==1)) { # the observation is a selected peace
        temp.type.seq <- 1
    }
    
    # coding other crises
    if(mydata$month[i]=="1998m1") {
        temp.crises <- 0
    } else {
        temp.ctry <- mydata$ctryname[i]
        lastmonth <- as.numeric(substr(mydata$month[i], 6, 7)) - 1
        if(lastmonth==0) {
            lastyear <- as.numeric(substr(mydata$month[i], 1, 4)) - 1
            monthmatch <- paste(lastyear, "m12", sep="")
        } else {
            monthmatch <- paste(substr(mydata$month[i], 1, 4), "m", lastmonth, sep="")
        }
        cat(monthmatch, " \n")
        t <- mydata[which(mydata$ctryname==temp.ctry & mydata$month==monthmatch), c("gtdsrebellion", "gtdsinsurgency", "gtdscrisisdomestic", "gtdsviolence")]
        temp.crises <- sum(t[1,])
    }
    
    count.seq <- count.seq+1
    count.j <- 1
    if(i<12){next} #VJD: 8/12/14
    for(j in (i-12):i) {
      temp.counts <- as.vector(c(mydata$A_verb_coop[j],mydata$A_mater_coop[j],mydata$A_verb_confl[j],mydata$A_mater_confl[j],mydata$B_verb_coop[j],mydata$B_mater_coop[j],mydata$B_verb_confl[j],mydata$B_mater_confl[j]))
      temp.week <- as.character(mydata$date_str[j])
      temp.state <- as.character(mydata$COUNTRY[j])
      history[,count.j,count.seq] <- c(temp.week,temp.state,temp.counts,temp.type.seq, temp.crises)
      count.j <- count.j+1
    } # end for j

    # for lag.history and the 5-week lag in general it is necessary to deal with an out-of-bounds issue for
    # any time that total.peace equals 1 within the first 17 weeks - at week 18 we no longer have this problem
    # by definition for there to be an onset of conflict or peace, it must be preceded by 12 weeks of peace.
    # this means i-12 never overlaps with the previous country OR goes out-of-bounds for the first country.
    # BUT, i-17 will not only go out-of-bounds for the first country, it could contain observations from the previous country. vjd 10/16

    count.k <- 1

    # if the year is greater than 1998 OR the week is greater than 17
    if(isTRUE(as.numeric(substr(mydata$date_str[i],1,4))>1998) | isTRUE(as.numeric(substr(mydata$date_str[i],6,7))>17)) {
        count.lagseq <- count.lagseq + 1
        if(i<17){next} #VJD: 8/12/14
        for(k in (i-17):(i-5)) {
            temp.counts <- as.vector(c(mydata$A_verb_coop[k],mydata$A_mater_coop[k],mydata$A_verb_confl[k],mydata$A_mater_confl[k],mydata$B_verb_coop[k],mydata$B_mater_coop[k],mydata$B_verb_confl[k],mydata$B_mater_confl[k]))
            temp.week <- as.character(mydata$date_str[k])
            temp.state <- as.character(mydata$COUNTRY[k])
            lag.history[,count.k,count.lagseq] <- c(temp.week,temp.state,temp.counts,temp.type.seq)
            count.k <- count.k+1
        } # end for k
    } # end if year is greater than 1998 or week is greater than 17
  } # end if total.peace equals 1
} # end ext for

rownames(history) <- c("time","state","A_verb_coop","A_mater_coop","A_verb_confl","A_mater_confl","B_verb_coop","B_mater_coop","B_verb_confl","B_mater_confl","seq_type", "Other_Crises")
rownames(lag.history) <- c("time","state","A_verb_coop","A_mater_coop","A_verb_confl","A_mater_confl","B_verb_coop","B_mater_coop","B_verb_confl","B_mater_confl","seq_type")

save(list=ls(), file="select_sequences_ucdp_111914.RData")


